{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0f58f98a",
   "metadata": {},
   "source": [
    "## Pandas-离散化和面元划分"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5bd6b4a3",
   "metadata": {},
   "source": [
    "内容介绍:\n",
    " * 离散化：把连续的数值划分到若干区间(面元)中，从而得到每个数据在整体中的相对大小\n",
    " * 面元划分：按指定的边界或面元数量对数据分阶段(分箱)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "567447e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9a6bc27e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    58\n",
      "1    26\n",
      "2    23\n",
      "3    35\n",
      "4    41\n",
      "5    22\n",
      "6    42\n",
      "7    26\n",
      "8    40\n",
      "9    32\n",
      "dtype: int32\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>B</th>\n",
       "      <th>A</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>-2</td>\n",
       "      <td>8</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>-9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   B  A  C\n",
       "d -2  8 -1\n",
       "b  5  6  6\n",
       "c  6  0 -1\n",
       "a  2  5 -9"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 示例数据\n",
    "s0 = pd.Series(np.random.randint(18,70,size=(10)))\n",
    "print(s0)\n",
    "df0 = pd.DataFrame(np.random.randint(-9,9,size=(4,3)),index=['d','b','c','a'],columns=['B','A','C'])\n",
    "df0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "b1ff2c53",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([21, 83, 40, 48, 80, 62, 61, 70, 87, 82, 58, 42, 72, 25, 32, 27, 91,\n",
       "       68, 29, 51])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arr = np.random.randint(18,100,size=(20))\n",
    "arr"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1774b25e",
   "metadata": {},
   "source": [
    "### 1.1面元划分-cut函数\n",
    "\n",
    "面元划分及相关操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "dd8e4955",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 划分数据的阶段\n",
    "bins = [18,25,35,60,100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c938a00b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(18, 25], (60, 100], (35, 60], (35, 60], (60, 100], ..., (25, 35], (60, 100], (60, 100], (25, 35], (35, 60]]\n",
       "Length: 20\n",
       "Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#获取每个数据所属的面元。返回的是Categorical对象，其中每个元素是该数据所在的区间，而不是字符串。\n",
    "cats = pd.cut(arr,bins)\n",
    "cats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "f019976d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#categories属性给出全部面元区间。向cut传入python列表或numpy ndarray时返回Categorical，可直接使用此属性；传入Series时返回的是Series，需改用 .cat.categories 访问。\n",
    "cats.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "ea603ea8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 3, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 3, 0, 1, 1, 3, 3, 1, 2],\n",
       "      dtype=int8)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#显示每个数据所属面元的编号(从0开始，对应categories中的位置)\n",
    "cats.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "040297d3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60, 100]    10\n",
       "(35, 60]      5\n",
       "(25, 35]      3\n",
       "(18, 25]      2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 分类数据统计\n",
    "pd.value_counts(cats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "e3c27c8c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[18, 25), [60, 100), [35, 60), [35, 60), [60, 100), ..., [25, 35), [60, 100), [60, 100), [25, 35), [35, 60)]\n",
       "Length: 20\n",
       "Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#默认区间为左开右闭。设置right=False可改为左闭右开：\n",
    "cat2 = pd.cut(arr,bins,right=False)\n",
    "cat2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "ab2171f7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[60, 100)    10\n",
       "[35, 60)      5\n",
       "[25, 35)      4\n",
       "[18, 25)      1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 分类数据统计\n",
    "pd.value_counts(cat2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "251f1522",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['青少年', '老年', '中年', '中年', '老年', ..., '青年', '老年', '老年', '青年', '中年']\n",
       "Length: 20\n",
       "Categories (4, object): ['青少年' < '青年' < '中年' < '老年']"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#自定义面元名称。让面元(数据分区)有可理解的意义。\n",
    "names=['青少年','青年','中年','老年']\n",
    "cat3 = pd.cut(arr,bins,labels=names)\n",
    "cat3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "8837952d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "老年     10\n",
       "中年      5\n",
       "青年      3\n",
       "青少年     2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#统计各面元的数据量时，结果按自定义的标签显示\n",
    "pd.value_counts(cat3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "c5ee22e2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.87767343, 0.04761865, 0.83973173, 0.36743851, 0.57763083,\n",
       "       0.48788535, 0.51063946, 0.60210982, 0.82046727, 0.36311687,\n",
       "       0.18892736, 0.04363943, 0.79841758, 0.90671213, 0.15947717,\n",
       "       0.77079391, 0.58197522, 0.38258206, 0.9495849 , 0.31636671])"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#生成20个[0,1)内的随机数，用于演示不指定面元边界、只指定面元数量的划分\n",
    "arr2 = np.random.rand(20)\n",
    "arr2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "6033aa04",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0.72, 0.95], (0.043, 0.27], (0.72, 0.95], (0.27, 0.5], (0.5, 0.72], ..., (0.72, 0.95], (0.5, 0.72], (0.27, 0.5], (0.72, 0.95], (0.27, 0.5]]\n",
       "Length: 20\n",
       "Categories (4, interval[float64, right]): [(0.043, 0.27] < (0.27, 0.5] < (0.5, 0.72] < (0.72, 0.95]]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#在cut函数中传入面元数量，函数会根据数据的最小值和最大值自动划分出等宽的区间。precision=2用于控制区间边界的显示精度。\n",
    "# 即把数值所在的区间四等分。例如数值范围是0-100时，每个面元的宽度为25。\n",
    "cat4 = pd.cut(arr2,4,precision=2)\n",
    "cat4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "936eb1fa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.72, 0.95]     7\n",
       "(0.27, 0.5]      5\n",
       "(0.043, 0.27]    4\n",
       "(0.5, 0.72]      4\n",
       "dtype: int64"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(cat4)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a520a18",
   "metadata": {},
   "source": [
    "### 1.2面元划分-qcut函数\n",
    "\n",
    "得到每个面元数据量相等的划分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "0f71916e",
   "metadata": {},
   "outputs": [],
   "source": [
    "arr12 = np.random.rand(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "af68556c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(-0.00042200000000000007, 0.268], (0.268, 0.514], (-0.00042200000000000007, 0.268], (0.514, 0.752], (-0.00042200000000000007, 0.268], ..., (0.514, 0.752], (0.268, 0.514], (0.268, 0.514], (0.514, 0.752], (0.514, 0.752]]\n",
       "Length: 1000\n",
       "Categories (4, interval[float64, right]): [(-0.00042200000000000007, 0.268] < (0.268, 0.514] < (0.514, 0.752] < (0.752, 0.999]]"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# qcut按样本分位数划分，可以实现按照数量的四等分。即总体为100个数据时，每个面元分到25个数据。\n",
    "cat12 = pd.qcut(arr12,4)\n",
    "cat12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "83bf1f74",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(-0.00042200000000000007, 0.268]    250\n",
       "(0.268, 0.514]                      250\n",
       "(0.514, 0.752]                      250\n",
       "(0.752, 0.999]                      250\n",
       "dtype: int64"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(cat12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "dc3aeed8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(-0.00042200000000000007, 0.113], (0.113, 0.514], (0.113, 0.514], (0.514, 0.896], (0.113, 0.514], ..., (0.514, 0.896], (0.113, 0.514], (0.113, 0.514], (0.514, 0.896], (0.514, 0.896]]\n",
       "Length: 1000\n",
       "Categories (3, interval[float64, right]): [(-0.00042200000000000007, 0.113] < (0.113, 0.514] < (0.514, 0.896]]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#qcut也可以传入自定义的分位数(取值在0到1之间)。这里的分位数没有包含1，所以大于0.9分位数的数据不属于任何面元，结果为NaN。\n",
    "cat121 = pd.qcut(arr12,[0,0.1,0.5,0.9])\n",
    "cat121"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "ef7dc3ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.113, 0.514]                      400\n",
       "(0.514, 0.896]                      400\n",
       "(-0.00042200000000000007, 0.113]    100\n",
       "dtype: int64"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(cat121)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b6c5188",
   "metadata": {},
   "source": [
    "### 2.检测和过滤异常值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "87ea49d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.951637</td>\n",
       "      <td>-0.009127</td>\n",
       "      <td>-0.294744</td>\n",
       "      <td>-0.796159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.293682</td>\n",
       "      <td>-0.001784</td>\n",
       "      <td>0.001199</td>\n",
       "      <td>-0.009707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.441607</td>\n",
       "      <td>0.512919</td>\n",
       "      <td>0.979075</td>\n",
       "      <td>0.323896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.159724</td>\n",
       "      <td>0.473924</td>\n",
       "      <td>-2.286506</td>\n",
       "      <td>0.765562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.671371</td>\n",
       "      <td>-1.180175</td>\n",
       "      <td>-1.133407</td>\n",
       "      <td>1.856926</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>-0.622098</td>\n",
       "      <td>-0.497169</td>\n",
       "      <td>-0.073101</td>\n",
       "      <td>1.388692</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>-0.062709</td>\n",
       "      <td>0.337784</td>\n",
       "      <td>-1.364190</td>\n",
       "      <td>0.628116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>-0.107963</td>\n",
       "      <td>-0.130047</td>\n",
       "      <td>0.497731</td>\n",
       "      <td>-1.143566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>1.219360</td>\n",
       "      <td>-0.483168</td>\n",
       "      <td>-0.993379</td>\n",
       "      <td>-1.207324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>0.917210</td>\n",
       "      <td>2.302226</td>\n",
       "      <td>-0.545264</td>\n",
       "      <td>0.723568</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            0         1         2         3\n",
       "0    1.951637 -0.009127 -0.294744 -0.796159\n",
       "1    1.293682 -0.001784  0.001199 -0.009707\n",
       "2    0.441607  0.512919  0.979075  0.323896\n",
       "3    0.159724  0.473924 -2.286506  0.765562\n",
       "4    1.671371 -1.180175 -1.133407  1.856926\n",
       "..        ...       ...       ...       ...\n",
       "995 -0.622098 -0.497169 -0.073101  1.388692\n",
       "996 -0.062709  0.337784 -1.364190  0.628116\n",
       "997 -0.107963 -0.130047  0.497731 -1.143566\n",
       "998  1.219360 -0.483168 -0.993379 -1.207324\n",
       "999  0.917210  2.302226 -0.545264  0.723568\n",
       "\n",
       "[1000 rows x 4 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.DataFrame(np.random.randn(1000,4))\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "41582f9e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         0      1      2      3\n",
       "0    False  False  False  False\n",
       "1    False  False  False  False\n",
       "2    False  False  False  False\n",
       "3    False  False   True  False\n",
       "4    False  False  False  False\n",
       "..     ...    ...    ...    ...\n",
       "995  False  False  False  False\n",
       "996  False  False  False  False\n",
       "997  False  False  False  False\n",
       "998  False  False  False  False\n",
       "999  False   True  False  False\n",
       "\n",
       "[1000 rows x 4 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.abs(df2)>2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "75da77ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.159724</td>\n",
       "      <td>0.473924</td>\n",
       "      <td>-2.286506</td>\n",
       "      <td>0.765562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>-1.190548</td>\n",
       "      <td>1.514092</td>\n",
       "      <td>0.635842</td>\n",
       "      <td>2.421032</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>-0.657061</td>\n",
       "      <td>2.075509</td>\n",
       "      <td>0.521587</td>\n",
       "      <td>0.744975</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>-0.012271</td>\n",
       "      <td>0.832700</td>\n",
       "      <td>0.552373</td>\n",
       "      <td>3.003729</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>-1.670941</td>\n",
       "      <td>-2.461845</td>\n",
       "      <td>-1.258139</td>\n",
       "      <td>-0.935025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>986</th>\n",
       "      <td>-0.427006</td>\n",
       "      <td>2.654343</td>\n",
       "      <td>0.478391</td>\n",
       "      <td>0.624780</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>989</th>\n",
       "      <td>-2.219238</td>\n",
       "      <td>-0.011853</td>\n",
       "      <td>-0.025186</td>\n",
       "      <td>0.817374</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>993</th>\n",
       "      <td>0.034468</td>\n",
       "      <td>1.213977</td>\n",
       "      <td>0.792636</td>\n",
       "      <td>2.092112</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>994</th>\n",
       "      <td>0.404317</td>\n",
       "      <td>0.186653</td>\n",
       "      <td>-0.732448</td>\n",
       "      <td>2.489930</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>0.917210</td>\n",
       "      <td>2.302226</td>\n",
       "      <td>-0.545264</td>\n",
       "      <td>0.723568</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>177 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            0         1         2         3\n",
       "3    0.159724  0.473924 -2.286506  0.765562\n",
       "12  -1.190548  1.514092  0.635842  2.421032\n",
       "14  -0.657061  2.075509  0.521587  0.744975\n",
       "17  -0.012271  0.832700  0.552373  3.003729\n",
       "36  -1.670941 -2.461845 -1.258139 -0.935025\n",
       "..        ...       ...       ...       ...\n",
       "986 -0.427006  2.654343  0.478391  0.624780\n",
       "989 -2.219238 -0.011853 -0.025186  0.817374\n",
       "993  0.034468  1.213977  0.792636  2.092112\n",
       "994  0.404317  0.186653 -0.732448  2.489930\n",
       "999  0.917210  2.302226 -0.545264  0.723568\n",
       "\n",
       "[177 rows x 4 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#找出至少有一列的绝对值大于2的行，用于筛选异常值。\n",
    "df2[(np.abs(df2)>2).any(1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a8efdaf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#把绝对值大于3的异常值统一设置为3。注意：小于-3的值也会被设为+3；若要保留符号，可改用 np.sign(df2)*3。\n",
    "df2[(np.abs(df2)>3)] = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9aa00cfe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.037085</td>\n",
       "      <td>0.035339</td>\n",
       "      <td>0.020592</td>\n",
       "      <td>0.079983</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.014670</td>\n",
       "      <td>0.994967</td>\n",
       "      <td>0.952063</td>\n",
       "      <td>0.996232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-2.641156</td>\n",
       "      <td>-2.617328</td>\n",
       "      <td>-2.682684</td>\n",
       "      <td>-2.768796</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>-0.685504</td>\n",
       "      <td>-0.604792</td>\n",
       "      <td>-0.649864</td>\n",
       "      <td>-0.584327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.030673</td>\n",
       "      <td>-0.009623</td>\n",
       "      <td>0.028430</td>\n",
       "      <td>0.049411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.707349</td>\n",
       "      <td>0.658782</td>\n",
       "      <td>0.635780</td>\n",
       "      <td>0.733608</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 0            1            2            3\n",
       "count  1000.000000  1000.000000  1000.000000  1000.000000\n",
       "mean      0.037085     0.035339     0.020592     0.079983\n",
       "std       1.014670     0.994967     0.952063     0.996232\n",
       "min      -2.641156    -2.617328    -2.682684    -2.768796\n",
       "25%      -0.685504    -0.604792    -0.649864    -0.584327\n",
       "50%       0.030673    -0.009623     0.028430     0.049411\n",
       "75%       0.707349     0.658782     0.635780     0.733608\n",
       "max       3.000000     3.000000     3.000000     3.000000"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d773a63",
   "metadata": {},
   "source": [
    "### 3.排列和随机采样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8e2bb6e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>13</td>\n",
       "      <td>14</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>18</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3\n",
       "0   0   1   2   3\n",
       "1   4   5   6   7\n",
       "2   8   9  10  11\n",
       "3  12  13  14  15\n",
       "4  16  17  18  19"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df3 = pd.DataFrame(np.arange(5*4).reshape(5,4))\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9d3c54fb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2, 0, 3, 4, 1])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#获取0到4的一个随机排列(每个整数恰好出现一次)\n",
    "sam = np.random.permutation(5)\n",
    "sam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "bdfcc74d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>13</td>\n",
       "      <td>14</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>18</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3\n",
       "2   8   9  10  11\n",
       "0   0   1   2   3\n",
       "3  12  13  14  15\n",
       "4  16  17  18  19\n",
       "1   4   5   6   7"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#使用随即数组，获取行随机排列的数据\n",
    "#随即数组的生成是根据np.random.permutation函数进行\n",
    "df3.take(sam)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "bf2bc289",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>18</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3\n",
       "0   0   1   2   3\n",
       "2   8   9  10  11\n",
       "4  16  17  18  19"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#随机选取数行\n",
    "df3.sample(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "9ef120b0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0  1   2   3\n",
       "2  8  9  10  11"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#随机选取数行\n",
    "df3.sample(n=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "046d6a30",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    5\n",
       "1    8\n",
       "2    1\n",
       "3    3\n",
       "4    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ch = pd.Series([5,8,1,3,0])\n",
    "ch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "08ce1d20",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    1\n",
       "0    5\n",
       "3    3\n",
       "4    0\n",
       "4    0\n",
       "3    3\n",
       "1    8\n",
       "3    3\n",
       "1    8\n",
       "2    1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#选取大于数据量本身的随机数据\n",
    "ch.sample(n=10,replace=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
